#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(gplots)
library(WGCNA)
library(expss)
library(polycor)
library(knitr)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

clustering_selected = 'DynamicHybridMergedSmall'
#clustering_selected = 'DynamicHybrid'

cat(paste0('Using clustering ', clustering_selected))
## Using clustering DynamicHybridMergedSmall

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_15_WGCNA.Rmd)

# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% 
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))


# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]


# GO DEA
# load('./../Data/Gandal/GO_DE_clusters.RData')

rm(DE_info, GO_annotations, clusterings)

Check if for each gene, the module with the smallest distance is their corresponding module

mm = dataset %>% dplyr::select(starts_with('MM.'), starts_with('MMgray')) %>% dplyr::rename(MM.gray = MMgray)
rownames(mm) = dataset$ID
original_max_membership = gsub('MM.', '', colnames(mm)[max.col(mm, ties.method='first')])

cat(paste0('For ', round(100*mean(original_max_membership == gsub('#','',dataset$Module))),
           '% of the genes, their assigned module corresponds to the module with the highest Module Membership'))
## For 53% of the genes, their assigned module corresponds to the module with the highest Module Membership

Apparently this is not the case. Someone asked this in a Bioconductor question and Peter Langfelder answered that this is because WGCNA assigns module labels using dynamic tree cut of hierarchical clustering tree that is based on the Toplogical Overlap Measure. TOM results in similar but not quite the same similarity as correlation, hence for some genes the assigned module may differ from the module with highest kME. But that in all, he doesn’t worry about the module assignment vs. max. kME differences in his own analyses, and he recommends not worrying about it to others as well.

Before following his advice and stop worrying about it I’m going to try to understand better when these two modules don’t match

The palette on the top of the heatmap represents the size of the module, the darker the colour, the larger the module

module_size = dataset %>% group_by(Module) %>% tally %>% mutate(Module = gsub('#', '', Module))
module_size$quant = cut(module_size$n, breaks=9, labels=FALSE)
module_size$meanExpr = sapply(module_size$Module, function(m){mean(rowMeans(datExpr)[dataset$Module==m | dataset$Module==paste0('#',m)])})
module_size$quantME = cut(module_size$meanExpr, breaks=9, labels=FALSE)

heatmap.2(table(original_max_membership, gsub('#','',dataset$Module)), symm=TRUE, dendrogram='none', keysize=1,
          trace='none', scale='none', col=brewer.pal(9,'YlGnBu'), xlab='Assigned Module',  ylab='Highest MM',
          ColSideColors = brewer.pal(9,'PuBu')[module_size$quant])

Scaling by rows it’s easier to see that genes assigned to the larger modules sometimes have a higher module membership with smaller modules

heatmap.2(table(original_max_membership, gsub('#','',dataset$Module)), dendrogram='none', keysize=1, symm=TRUE,
          trace='none', scale='row', col=brewer.pal(9,'YlGnBu'), xlab='Assigned Module',  ylab='Highest MM',
          ColSideColors=brewer.pal(9,'PuBu')[module_size$quant])

Module Membership by gene (selecting a random sample so it’s not that heavy)

The membership of some module seems to be related to the level of expression of the genes

set.seed(123)
plot_mm = mm %>% sample_frac(0.05) %>% as.matrix
colnames(plot_mm) = gsub('MM.','', colnames(plot_mm))
plot_mm = plot_mm[order(rowMeans(plot_mm)), module_size$Module[order(module_size$n)]]

heatmap.2(plot_mm, xlab('Modules ordered by Size'), ylab('Genes ordered by mean expression'), keysize=1,
          ColSideColors = brewer.pal(9,'PuBu')[module_size$quant[order(module_size$n)]],
          RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
          trace='none', dendrogram='none', scale='none', Rowv=FALSE, Colv=FALSE, col=brewer.pal(9,'Spectral'))

Letting the heatmap order the genes and modules by distance, it seems like the module size is not an important factor but the mean expression is. The modules in the right leg of the dendrogram seem to be associated to genes with low expression and the ones on the left leg to genes with high expression

heatmap.2(plot_mm, trace='none', col=brewer.pal(9,'Spectral'), dendrogram='column',
          RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
          ColSideColors = brewer.pal(9,'PuBu')[module_size$quant[order(module_size$n)]])

The biggest modules have the most extreme memberships (both positive and negative)

plot_mm = plot_mm[order(rowMeans(abs(plot_mm))), module_size$Module[order(module_size$n)]]
heatmap.2(abs(plot_mm), ColSideColors = brewer.pal(9,'PuBu')[module_size$quant[order(module_size$n)]], keysize=1, 
          RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
          trace='none', dendrogram='none', scale='none', Rowv=FALSE, Colv=FALSE, col=brewer.pal(9,'YlOrRd'))

Checking if the mean expression of the modules plays a role

plot_mm = plot_mm[order(rowMeans(plot_mm)), module_size$Module[order(module_size$meanExpr)]]
heatmap.2(plot_mm, ColSideColors = brewer.pal(9,'RdPu')[module_size$quantME[order(module_size$meanExpr)]], keysize=1, 
          RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
          trace='none', dendrogram='none', scale='none', Rowv=FALSE, Colv=FALSE, col=brewer.pal(9,'Spectral'))

plot_mm = plot_mm[order(rowMeans(plot_mm)), module_size$Module[order(module_size$meanExpr)]]
heatmap.2(plot_mm, ColSideColors = brewer.pal(9,'RdPu')[module_size$quantME[order(module_size$meanExpr)]], keysize=1, 
          RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
          trace='none', dendrogram='none', scale='none', Rowv=TRUE, Colv=FALSE, col=brewer.pal(9,'Spectral'))

The mean expression pattern seems to be clearer in the samples than in the modules

The colors in the histogram seem to have inverted, with the red being the highest MM and blue de lowest (I have no idea how this happened)

plot_mm = plot_mm[order(rowMeans(plot_mm)), module_size$Module[order(module_size$meanExpr)]]
heatmap.2(plot_mm, ColSideColors = brewer.pal(9,'RdPu')[module_size$quantME[order(module_size$meanExpr)]], keysize=1, 
          RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
          trace='none', dendrogram='column', scale='none', col=brewer.pal(9,'Spectral'))

Because of the weird inversion in the heatmap palette from above, check if there’s a relation between the gene’s level of expression and the average level of expression of the module it is assigned to: there is and it is positive, so something weird is happening in the heatmaps above

I’m not sure how to compare these two plots …

plot_data = data.frame('ID' = dataset$ID, 'GeneMeanExpr'=rowMeans(datExpr), 'Module' = gsub('#','',dataset$Module)) %>%
            left_join(module_size, by='Module') %>% mutate('ModuleMeanExpr' = meanExpr)

MM_module_size = data.frame('Module' = original_max_membership) %>% group_by(Module) %>% tally()
MM_module_size$meanExpr = sapply(MM_module_size$Module, function(m){mean(rowMeans(datExpr)[original_max_membership==m |
                                                                                           original_max_membership==paste0('#',m)])})

MM_plot_data = data.frame('ID' = dataset$ID, 'GeneMeanExpr'=rowMeans(datExpr), 'Module' = gsub('#','',dataset$Module)) %>%
            left_join(MM_module_size, by='Module') %>% mutate('ModuleMeanExpr' = meanExpr)


p1 = plot_data %>% ggplot(aes(ModuleMeanExpr, GeneMeanExpr)) + geom_point(alpha=0.2, color=gsub('#gray','gray',paste0('#',plot_data$Module))) + 
                   geom_smooth(color='gray') + theme_minimal() + xlab('Assigned Module Mean Expression') + ylab('Gene Mean Expression') +
                   ggtitle(paste0('Cor = ', round(cor(plot_data$ModuleMeanExpr, plot_data$GeneMeanExpr),3),
                                  '  Regression slope = ', round(lm(GeneMeanExpr ~ ModuleMeanExpr, data=plot_data)$coefficients[[2]],2),
                                  '  R^2 = ', round(cor(plot_data$ModuleMeanExpr, plot_data$GeneMeanExpr)^2,3)))

p2 = MM_plot_data %>% ggplot(aes(ModuleMeanExpr, GeneMeanExpr)) + geom_point(alpha=0.2, color=gsub('#gray','gray',paste0('#',plot_data$Module))) + 
                      geom_smooth(color='gray') + theme_minimal() + xlab('Highest MM Module Mean Expression') + ylab('Gene Mean Expression') +
                      ggtitle(paste0('Cor = ', round(cor(MM_plot_data$ModuleMeanExpr, MM_plot_data$GeneMeanExpr),3),
                                  '  Regression slope = ', round(lm(GeneMeanExpr ~ ModuleMeanExpr, data=MM_plot_data)$coefficients[[2]],2),
                                  '  R^2 = ', round(cor(MM_plot_data$ModuleMeanExpr, MM_plot_data$GeneMeanExpr)^2,3)))

grid.arrange(p1, p2, nrow = 1)

rm(p1, p2)

Mean expression relation by assigned module and module with the highest module membership

Assigning genes by MM increases the mean expression of the largest modules and decreases the mean expression of the smallest ones

plot_data = module_size %>% dplyr::select(Module, meanExpr, n) %>% rename(AssignedModuleME=meanExpr) %>%
            left_join(MM_module_size, by='Module') %>% rename(MMModuleME=meanExpr)

ggplotly(plot_data %>% ggplot(aes(AssignedModuleME, MMModuleME, size=n.x)) + geom_abline(slope=1, intercept=0, color='gray') +
         geom_point(color=gsub('#gray','gray',paste0('#',plot_data$Module)), alpha=0.5, aes(id=Module)) + ggtitle('Mean Expression') + 
         xlab('Mean Expression of Assigned Module') + ylab('Mean Expression of Module with Higest MM') + theme_minimal() + coord_fixed())

There doesn’t seem to be a relation between module size and level of expression, so the result from above probably has no effect on the level of expression patterns found before

plot_data = module_size %>% dplyr::select(Module, meanExpr, n)

ggplotly(plot_data %>% ggplot(aes(meanExpr, n)) + geom_point(color=gsub('#gray','gray',paste0('#',plot_data$Module))) + 
         geom_smooth(color='gray', alpha=0.1) + theme_minimal())

Module size by assigned module and module with the highest module membership

Assigning genes by MM balances more the size of the modules, increasing the smallest ones and reducing the largest ones. Mean expression doesn’t seem to be a important factor here.

plot_data = module_size %>% dplyr::select(Module, meanExpr, n) %>% rename(AssignedModuleN=n) %>%
            left_join(MM_module_size, by='Module') %>% rename(MMModuleN=n)

ggplotly(plot_data %>% ggplot(aes(AssignedModuleN, MMModuleN, size=meanExpr.x)) + geom_abline(slope=1, intercept=0, color='gray') +
         geom_smooth(color='gray', se=FALSE) + geom_point(color=gsub('#gray','gray',paste0('#',plot_data$Module)), alpha=0.5, aes(id=Module)) +
         ggtitle('Module Size') + xlab('Size of Assigned Module') + ylab('Size of Module with Higest MM') + theme_minimal())

Conclusion



Session info

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] knitr_1.24            polycor_0.7-10        expss_0.10.1         
##  [4] WGCNA_1.68            fastcluster_1.1.25    dynamicTreeCut_1.63-1
##  [7] gplots_3.0.1.1        GGally_1.4.0          gridExtra_2.3        
## [10] viridis_0.5.1         viridisLite_0.3.0     RColorBrewer_1.1-2   
## [13] dendextend_1.13.2     plotly_4.9.1          glue_1.3.1           
## [16] reshape2_1.4.3        forcats_0.4.0         stringr_1.4.0        
## [19] dplyr_0.8.3           purrr_0.3.3           readr_1.3.1          
## [22] tidyr_1.0.0           tibble_2.1.3          ggplot2_3.2.1        
## [25] tidyverse_1.3.0      
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.2-0                 plyr_1.8.5                 
##   [5] lazyeval_0.2.2              splines_3.6.0              
##   [7] crosstalk_1.0.0             BiocParallel_1.20.1        
##   [9] GenomeInfoDb_1.22.0         robust_0.4-18.2            
##  [11] digest_0.6.23               foreach_1.4.7              
##  [13] htmltools_0.4.0             GO.db_3.10.0               
##  [15] gdata_2.18.0                fansi_0.4.1                
##  [17] magrittr_1.5                checkmate_1.9.4            
##  [19] memoise_1.1.0               fit.models_0.5-14          
##  [21] cluster_2.0.8               doParallel_1.0.15          
##  [23] annotate_1.64.0             modelr_0.1.5               
##  [25] matrixStats_0.55.0          colorspace_1.4-1           
##  [27] blob_1.2.0                  rvest_0.3.5                
##  [29] rrcov_1.4-7                 haven_2.2.0                
##  [31] xfun_0.8                    crayon_1.3.4               
##  [33] RCurl_1.95-4.12             jsonlite_1.6               
##  [35] genefilter_1.68.0           zeallot_0.1.0              
##  [37] impute_1.60.0               survival_2.44-1.1          
##  [39] iterators_1.0.12            gtable_0.3.0               
##  [41] zlibbioc_1.32.0             XVector_0.26.0             
##  [43] DelayedArray_0.12.2         BiocGenerics_0.32.0        
##  [45] DEoptimR_1.0-8              scales_1.1.0               
##  [47] mvtnorm_1.0-11              DBI_1.1.0                  
##  [49] Rcpp_1.0.3                  xtable_1.8-4               
##  [51] htmlTable_1.13.1            foreign_0.8-71             
##  [53] bit_1.1-15.1                preprocessCore_1.48.0      
##  [55] Formula_1.2-3               stats4_3.6.0               
##  [57] htmlwidgets_1.5.1           httr_1.4.1                 
##  [59] ellipsis_0.3.0              acepack_1.4.1              
##  [61] farver_2.0.3                XML_3.98-1.20              
##  [63] pkgconfig_2.0.3             reshape_0.8.8              
##  [65] nnet_7.3-12                 dbplyr_1.4.2               
##  [67] locfit_1.5-9.1              later_1.0.0                
##  [69] labeling_0.3                tidyselect_0.2.5           
##  [71] rlang_0.4.2                 AnnotationDbi_1.48.0       
##  [73] munsell_0.5.0               cellranger_1.1.0           
##  [75] tools_3.6.0                 cli_2.0.1                  
##  [77] generics_0.0.2              RSQLite_2.2.0              
##  [79] broom_0.5.3                 fastmap_1.0.1              
##  [81] evaluate_0.14               yaml_2.2.0                 
##  [83] bit64_0.9-7                 fs_1.3.1                   
##  [85] robustbase_0.93-5           caTools_1.17.1.2           
##  [87] nlme_3.1-139                mime_0.8                   
##  [89] xml2_1.2.2                  compiler_3.6.0             
##  [91] rstudioapi_0.10             reprex_0.3.0               
##  [93] geneplotter_1.64.0          pcaPP_1.9-73               
##  [95] stringi_1.4.5               lattice_0.20-38            
##  [97] Matrix_1.2-17               vctrs_0.2.1                
##  [99] pillar_1.4.3                lifecycle_0.1.0            
## [101] data.table_1.12.8           bitops_1.0-6               
## [103] httpuv_1.5.2                GenomicRanges_1.38.0       
## [105] R6_2.4.1                    latticeExtra_0.6-28        
## [107] promises_1.1.0              KernSmooth_2.23-15         
## [109] IRanges_2.20.2              codetools_0.2-16           
## [111] MASS_7.3-51.4               gtools_3.8.1               
## [113] assertthat_0.2.1            SummarizedExperiment_1.16.1
## [115] DESeq2_1.26.0               withr_2.1.2                
## [117] S4Vectors_0.24.2            GenomeInfoDbData_1.2.2     
## [119] mgcv_1.8-28                 parallel_3.6.0             
## [121] hms_0.5.3                   grid_3.6.0                 
## [123] rpart_4.1-15                rmarkdown_1.14             
## [125] Cairo_1.5-10                shiny_1.4.0                
## [127] Biobase_2.46.0              lubridate_1.7.4            
## [129] base64enc_0.1-3